This notebook is based on the original fragment detection notebook, but is specific to the detection of participle phrase fragments. As our training data we will use a data file of 23,471 sentences, each containing a participle phrase at the beginning, middle, or end of the sentence, and 23,471 participle phrases extracted from those sentences -- these raw participle phrases will always be fragments. The labels will be either 1 or 0, where 1 indicates a participle phrase fragment and 0 indicates that it is NOT a participle phrase fragment.
In [ ]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import spacy
nlp = spacy.load('en_core_web_lg')
import re
from nltk.util import ngrams, trigrams
import csv
In [ ]:
import subprocess

# Run combine.py in the data directory and wait for it to finish.
# Presumably this produces childrens_fragments2.combined.txt, which the next
# cell reads -- confirm against combine.py.
# subprocess.run blocks until the script exits; the earlier Popen call returned
# immediately, so the following cell could open a missing or half-written file.
# check=True raises CalledProcessError if the script exits with a non-zero code.
subprocess.run(
    ["python", "combine.py", "childrens_fragments2"],
    cwd='../data/fragments/participle-phrases',
    check=True,
)
In [ ]:
# Load the combined corpus, one example per line.
# Even-indexed lines (0, 2, ...) get label 0 and odd-indexed lines get label 1.
# Assumes the file alternates full sentence / extracted fragment -- confirm
# against combine.py.
texts, labels = [], []
with open("../data/fragments/participle-phrases/childrens_fragments2.combined.txt", "r") as f:
    for i, sentence_or_fragment in enumerate(f):
        labels.append(i % 2)
        texts.append(sentence_or_fragment.strip())
print(texts[-10:])
In [ ]:
import random

# Pair every text with its label so that a single shuffle reorders both
# sequences consistently.
combined = list(zip(texts, labels))
random.shuffle(combined)

# Unzip, then write back through slice assignment so the existing list objects
# are updated in place.
shuffled_texts, shuffled_labels = zip(*combined)
texts[:] = shuffled_texts
labels[:] = shuffled_labels

print(texts[-10:])
print(labels[-10:])
In [ ]:
def textStringToPOSArray(text):
    """Return the POS tag (spaCy ``tag_``) of every token in ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``; the
    result is a list of tag strings in token order.
    """
    return [token.tag_ for token in nlp(text)]
# Sanity check: display the POS tags of one sample text.
textStringToPOSArray(texts[3])
In [ ]:
def find_ngrams(input_list, n):
    """Return an iterator over the consecutive ``n``-grams of ``input_list``.

    Each n-gram is a tuple of ``n`` adjacent items. ``input_list`` must support
    slicing. Inputs shorter than ``n`` (and ``n == 0``) yield nothing.
    """
    # One shifted copy of the sequence per n-gram position; zipping them lines
    # up item i with items i+1 .. i+n-1 and stops at the shortest copy.
    shifted = (input_list[offset:] for offset in range(n))
    return zip(*shifted)
def getPOSTrigramsForTextString(text):
    """Return the list of POS-tag trigrams for ``text``.

    The text is tagged with ``textStringToPOSArray`` and the tags are grouped
    into consecutive 3-tuples with ``nltk.util.trigrams``.
    """
    return list(trigrams(textStringToPOSArray(text)))
# Sanity check: print one sample text with its label, then display its
# POS-tag trigrams.
print("Text: ", texts[3], labels[3])
getPOSTrigramsForTextString(texts[3])
In [ ]:
def trigramsToDictKeys(trigrams):
    """Turn each trigram into a single string key.

    The items of every trigram are joined with ``>`` (for example
    ``('DT', 'NN', 'VBZ')`` becomes ``'DT>NN>VBZ'``). Returns a list of keys in
    the same order as the input.
    """
    return ['>'.join(tags) for tags in trigrams]
# Sanity check: print one sample text followed by its trigram keys.
print(texts[2])
print(trigramsToDictKeys(getPOSTrigramsForTextString(texts[2])))
In [ ]:
from collections import Counter

# Tally how often every POS-trigram key occurs across the whole corpus.
c = Counter()
for textString in texts:
    pos_trigrams = getPOSTrigramsForTextString(textString)
    c.update(trigramsToDictKeys(pos_trigrams))
total_counts = c

# NOTE: despite the wording of the message, the number printed here is the
# count of distinct trigram keys, not a word count.
print("Total words in data set: ", len(total_counts))
In [ ]:
# Vocabulary: every trigram key, most frequent first. Counter.most_common()
# sorts by count in descending order and keeps ties in insertion order.
vocab = [trigram_key for trigram_key, _ in total_counts.most_common()]
print(vocab[:60])
In [ ]:
# Print the last vocabulary entry (the least frequent trigram key) and its count.
print(vocab[-1], ': ', total_counts[vocab[-1]])
Take the trigrams and index them
In [ ]:
# Map every trigram key to its position in vocab; that position is the key's
# column index in the feature vectors.
word2idx = dict(zip(vocab, range(len(vocab))))
print(word2idx)
In [ ]:
def textToTrigrams(text):
    """Return the POS-trigram string keys for ``text``.

    Convenience wrapper: tags the text, groups the tags into trigrams, and
    joins each trigram into a ``>``-separated key.
    """
    pos_trigrams = getPOSTrigramsForTextString(text)
    return trigramsToDictKeys(pos_trigrams)
def text_to_vector(text):
    """Return the trigram-count feature vector for ``text``.

    The result is a 1-D NumPy float array of length ``len(vocab)``. Element
    ``word2idx[key]`` holds the number of times the trigram ``key`` occurs in
    ``text``. Trigrams that are not in ``word2idx`` are ignored.
    """
    wordVector = np.zeros(len(vocab))
    for word in textToTrigrams(text):
        index = word2idx.get(word)
        # Index 0 is a valid column, so test against None explicitly, using
        # an identity check rather than `!=`.
        if index is not None:
            wordVector[index] += 1
    return wordVector
In [ ]:
# Sanity check: display the first 65 entries of the feature vector for a
# hand-written sentence.
text_to_vector('Donald, standing on the precipice, began to dance.')[:65]
In [ ]:
# Feature matrix: one row per text, one column per vocabulary trigram.
feature_shape = (len(texts), len(vocab))
word_vectors = np.zeros(feature_shape, dtype=np.int_)
# Each row is a view into word_vectors, so writing through row[:] fills the
# matrix in place (the float counts are cast to the integer dtype).
for row, text in zip(word_vectors, texts):
    row[:] = text_to_vector(text)
In [ ]:
# Display the first 23 columns of the first 5 word vectors
word_vectors[:5, :23]
In [ ]:
records = len(labels)
# NOTE: despite its name, test_fraction is the share of records used for
# TRAINING (90%); the remaining records form the test set.
test_fraction = 0.9
train_split = int(records * test_fraction)
# Number of held-out records. Computed by subtraction so that
# train_split + test_split == records with no floating-point rounding drift.
test_split = records - train_split
print(train_split, test_split)

# The data was shuffled earlier, so a contiguous head/tail split is random.
trainX, trainY = word_vectors[:train_split], to_categorical(labels[:train_split], 2)
# Hold out only the rows AFTER the training slice. The previous code sliced
# from test_split (about 10% in), so roughly 80% of the "test" rows were also
# training rows and the reported test accuracy was inflated.
testX, testY = word_vectors[train_split:], to_categorical(labels[train_split:], 2)
In [ ]:
# Display the last training feature vector together with its one-hot label.
trainX[-1], trainY[-1]
In [ ]:
# Display the size of the training set, the test set, and their sum.
len(trainY), len(testY), len(trainY) + len(testY)
In [ ]:
# Network building
def build_model():
    """Build and return the tflearn DNN used for fragment classification.

    Architecture: an input layer of width ``len(vocab)``, two fully connected
    ReLU layers (200 and 25 units), and a 2-unit softmax output. Training uses
    SGD with a learning rate of 0.1 and categorical cross-entropy loss.
    """
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()

    network = tflearn.input_data([None, len(vocab)])  # Input
    for units in (200, 25):
        network = tflearn.fully_connected(network, units, activation='ReLU')  # Hidden
    network = tflearn.fully_connected(network, 2, activation='softmax')  # Output
    network = tflearn.regression(network, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    return tflearn.DNN(network)
In [ ]:
# Display the vocabulary size, which is also the width of the network's input layer.
len(vocab)
In [ ]:
# Build a fresh, untrained network.
model = build_model()
In [ ]:
# Training
# 50 epochs with mini-batches of 128. validation_set=0.1 is expected to hold
# out 10% of the training data for validation (confirm against the tflearn
# DNN.fit docs); show_metric=True reports the metric while training.
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=50)
In [ ]:
# Testing
# Column 0 of the one-hot labels is 1 when the label is 0, so comparing the
# thresholded column-0 probability against testY[:, 0] scores the two-class
# prediction.
class0_probability = np.array(model.predict(testX))[:, 0]
predictions = (class0_probability >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:, 0], axis=0)
print("Test accuracy: ", test_accuracy)
In [ ]:
# Persist the trigram-key -> column-index mapping, one "key,index" row per entry.
# The with-block guarantees the file is flushed and closed (the previous bare
# open() left the handle open, so buffered rows could be missing from disk).
# newline="" is what the csv module asks for, so that row terminators are not
# translated a second time on platforms such as Windows.
with open("../models/participlevocabindex2.csv", "w", newline="") as vocab_index_file:
    w = csv.writer(vocab_index_file)
    w.writerows(word2idx.items())
In [ ]:
# Save the trained model under ../models.
model.save("../models/participle_model2.tfl")
In [ ]:
def test_sentence(sentence):
    """Print the model's verdict on whether ``sentence`` is a fragment.

    The sentence is vectorised with ``text_to_vector`` and scored by the
    module-level ``model``. Column 1 of the prediction is the probability of
    label 1; the verdict is 'Yes' when that probability is above 0.5.
    Returns None.
    """
    features = text_to_vector(sentence)
    fragment_probability = model.predict([features])[0][1]
    if fragment_probability > 0.5:
        verdict = 'Yes'
    else:
        verdict = 'No'
    print('Is this a participle phrase fragment?\n {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(fragment_probability), verdict)
In [ ]:
# Manual spot checks. Each call prints the model's fragment probability and a
# Yes/No verdict. The comments below describe the input only; they are not
# recorded model outputs.
# Complete sentence whose subject is an -ing phrase.
test_sentence("Neglecting to recognize the horrors those people endure allow people to go to war more easily.")
In [ ]:
# Complete sentence with a participle phrase in the middle.
test_sentence("Katherine, gesticulating wildly and dripping in sweat, kissed him on the cheek.")
In [ ]:
# Participle phrase on its own, with no main clause.
test_sentence("Working far into the night in an effort to salvage her little boat.")
In [ ]:
# The same phrase attached to a main clause.
test_sentence("Working far into the night in an effort to salvage her little boat, she slowly grew tired.")
In [ ]:
# Participle phrase on its own, with no main clause.
test_sentence("Rushing to the rescue with his party.")
In [ ]:
# Long complete sentence.
test_sentence("Isobel was about thirteen now, and as pretty a girl, according to Buzzby, as you could meet with in any part of Britain.")
In [ ]:
# Complete sentence that opens with a participle phrase.
test_sentence("Being of a modest and retiring disposition, Mr. Hawthorne avoided publicity.")
In [ ]:
# Complete sentence that opens with a participle phrase (no final period).
test_sentence("Clambering to the top of a bridge, he observed a great rainbow")
In [ ]:
# Only the participle phrase from the previous sentence.
test_sentence("Clambering to the top of a bridge.")
In [ ]:
# Only the main clause from that sentence.
test_sentence("He observed a great rainbow.")
In [ ]:
# Complete sentence that opens with a participle phrase.
test_sentence("Sitting on the iron throne, Joffry looked rather fat.")
In [ ]:
# Participle phrase on its own, with no main clause.
test_sentence("Worrying that a meteor or chunk of space debris will conk her on the head.")
In [ ]:
# The same phrase at the end of a complete sentence (no final period).
test_sentence("Aunt Olivia always wears a motorcycle helmet, worrying that a meteor or chunk of space debris will conk her on the head")
In [ ]:
# Participle phrase on its own, with no main clause.
test_sentence("Affecting the lives of many students in New York City.")
In [ ]:
# The same phrase at the end of a complete sentence.
test_sentence("Quill was a miracle, affecting the lives of many students in New York City.")
In [ ]:
# Two participle phrases with no main clause.
test_sentence("Standing on the edge of the cliff looking down.")
In [ ]:
# Complete sentence with the participle phrases in the middle.
test_sentence("Emilia, standing on the edge of the cliff and looking down, began to weep.")
In [ ]:
# Complete sentence with the participle phrases at the start.
test_sentence("Standing on the edge of the cliff and looking down, Emilia began to weep.")
In [ ]:
# Short past- and present-participle phrase with no main clause.
test_sentence("Tired and needing sleep.")
Save the vocab
In [ ]:
# Display the full vocabulary list. This cell only shows it; nothing is written
# to disk here.
vocab
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: